In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import datetime
import glob
import os
from collections import OrderedDict
import scipy.io
%matplotlib inline
%load_ext rpy2.ipython

Load the Python-R interface and import coda for computing chain statistics


In [ ]:
import rpy2.interactive as r
import rpy2.interactive.packages
r.packages.importr("coda")
rlib = r.packages.packages
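
As a quick smoke test of the interface: the effective sample size of i.i.d. noise should come out close to the number of samples (the exact value will vary from run to run)


In [ ]:
# ESS of white noise should be roughly equal to the sample size
print(rlib.coda.effectiveSize(np.random.randn(1000))[0])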

Function for correctly printing values to a specified number of significant figures


In [ ]:
def to_precision(x, p):
    p_str = str(p)
    fmt_string = '{0:.' + p_str + 'g}'
    return fmt_string.format(x)
# an alternative implementation which properly deals with trailing zeros
# (a function by Randle Taylor) can be loaded from its git URL by
# uncommenting the line below
# %load https://raw.githubusercontent.com/randlet/to-precision/master/to_precision.py
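
For example (the output values follow from Python's g format, which switches to scientific notation for large magnitudes)


In [ ]:
print(to_precision(1234.5678, 3))  # 1.23e+03
print(to_precision(0.0012345, 2))  # 0.0012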

Set the directory to load saved experiment results from, based on the EXP_DIR environment variable


In [ ]:
exp_dir = os.path.join(os.environ['EXP_DIR'], 'apm_mcmc')

Specify the file name pattern for the saved results of each data set and method combination. An ordered dict is used so that the ordering is maintained in the printed LaTeX table


In [ ]:
file_name_pattern_map = OrderedDict([
    (('Pima', 'PM MH'), '*pima_pmmh_chain_*_results.npz'),
    (('Pima', 'APM MI+MH'), '*pima_apm(mi+mh)_chain_*_results.npz'),
    (('Pima', 'APM SS+MH'), '*pima_apm(ss+mh)_chain_*_results.npz'),
    (('Pima', 'APM SS+SS'), '*pima_apm(ess+rdss)_chain_*_results.npz'),
    (('Breast', 'PM MH'), '*breast_pmmh_chain_*_results.npz'),
    (('Breast', 'APM MI+MH'), '*breast_apm(mi+mh)_chain_*_results.npz'),
    (('Breast', 'APM SS+MH'), '*breast_apm(ss+mh)_chain_*_results.npz'),
    (('Breast', 'APM SS+SS'), '*breast_apm(ess+rdss)_chain_*_results.npz'),
])

Load the saved chains and run statistics and store them in another ordered dict. The effective sample sizes and Gelman-Rubin $\hat{R}$ statistics for the chains are also computed at this point using the R coda interface


In [ ]:
results_map = OrderedDict()
for (data_set, method), file_name_pattern in file_name_pattern_map.items():
    file_list = glob.glob(os.path.join(exp_dir, file_name_pattern))
    chains = []
    chains_stats = []
    for file_path in file_list:
        results = np.load(file_path)
        chains.append(results['thetas'])
        chains_stats.append(results['n_reject_n_cubic_ops_comp_time'])
    chains = np.array(chains)
    chains_stats = np.array(chains_stats)
    n_effs = np.empty((chains.shape[0], 2))
    for i, chain in enumerate(chains):
        n_effs[i, 0] = rlib.coda.effectiveSize(chain[:, 0])[0]
        n_effs[i, 1] = rlib.coda.effectiveSize(chain[:, 1])[0]
    r_chains_list = rlib.coda.as_mcmc_list([rlib.coda.as_mcmc(chain) for chain in chains])
    gelman_rubin = rlib.coda.gelman_diag(r_chains_list, autoburnin=False)
    results_map[(data_set, method)] = (chains, chains_stats, n_effs, gelman_rubin)
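
As a cross-check on the coda estimates, the effective sample size can also be approximated directly in NumPy. Below is a minimal sketch using the initial positive sequence estimator (truncating the autocorrelation sum at the first negative term); it will not exactly match coda's spectral-density-based estimate, and the names ess_numpy / pm_chains are illustrative


In [ ]:
def ess_numpy(x):
    # approximate ESS: n / (1 + 2 * sum of autocorrelations),
    # truncating the sum at the first negative autocorrelation
    x = np.asarray(x, dtype=float) - np.mean(x)
    n = x.size
    autocov = np.correlate(x, x, mode='full')[n - 1:] / n
    autocorr = autocov / autocov[0]
    tau = 1.
    for k in range(1, n):
        if autocorr[k] < 0:
            break
        tau += 2 * autocorr[k]
    return n / tau

pm_chains, _, pm_n_effs, _ = results_map[('Pima', 'PM MH')]
print(ess_numpy(pm_chains[0, :, 0]), pm_n_effs[0, 0])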

In [ ]:
prc_mn = 3 # precision to report means with
prc_se = 2 # precision to report standard errors with
max_n_chains = 0 # will be populated with max n chains to allow proper 
                 # formatting of autocorr plots later for cases when
                 # plotting intermediate results with differing number
                 # of chains completed per method / data set
# header for LaTeX table of results
latex_table = ''
latex_table += ' & Method & $N_\\text{cub.op}$ & Acc. rate '
latex_table += '& $N_\\text{eff}$ & $\\frac{N_\\text{eff}}{N_\\text{cub.op}}$ & $\\hat{R}$ '
latex_table += '& $N_\\text{eff}$ & $\\frac{N_\\text{eff}}{N_\\text{cub.op}}$ & $\\hat{R}$ '
latex_table += '\\\\ \n \\hline \n'
for (data_set, method), (chains, chains_stats, n_effs, gelman_rubin) in results_map.items():
    n_chains, n_samples, n_param = chains.shape
    max_n_chains = max(max_n_chains, n_chains) # update record of maximum no. chains
    # second last column of chain stats is number of cubic operations for a run
    # for display purposes, divide by 1000 as easier to visually compare without
    # scientific notation
    # possibly two reject rates (for u|theta and theta|u updates) present so index
    # chain_stats from end rather than start to make consistent
    n_kcops = chains_stats[:, -2] / 1000. 
    # calculate various mean stats over chains and their associated standard errors
    mean_n_k_cub_ops = n_kcops.mean()
    ster_n_k_cub_ops = n_kcops.std(ddof=1) / n_chains**0.5
    mean_n_eff_samps = n_effs.mean(0)
    ster_n_eff_samps = n_effs.std(0, ddof=1) / n_chains**0.5
    mean_es_per_kcop = (n_effs / n_kcops[:, None]).mean(0)
    ster_es_per_kcop = (n_effs / n_kcops[:, None]).std(0, ddof=1) / n_chains**0.5
    # third column from end contains reject rate for theta|u updates
    # often will be first column however sometimes reject rate for u|theta updates
    # present as first column
    acc_rates = 1. - chains_stats[:, -3] * 1. / n_samples
    mean_accept_rate = acc_rates.mean()
    ster_accept_rate = acc_rates.std(0, ddof=1) / n_chains**0.5
    # add row for current results to LaTeX table
    latex_table += ' & \\sc {0} & {1} ({2}) & {3} ({4})\n'.format(
        method.lower(), 
        to_precision(mean_n_k_cub_ops, prc_mn), 
        to_precision(ster_n_k_cub_ops, prc_se),
        to_precision(mean_accept_rate, prc_mn), 
        to_precision(ster_accept_rate, prc_se)
    )
    latex_table += ' & {0} ({1}) & {2} ({3}) & {4}\n'.format(
        to_precision(mean_n_eff_samps[0], prc_mn), 
        to_precision(ster_n_eff_samps[0], prc_se),
        to_precision(mean_es_per_kcop[0], prc_mn), 
        to_precision(ster_es_per_kcop[0], prc_se),
        to_precision(gelman_rubin[0][0], prc_mn),
    )
    latex_table += ' & {0} ({1}) & {2} ({3}) & {4}'.format(
        to_precision(mean_n_eff_samps[1], prc_mn), 
        to_precision(ster_n_eff_samps[1], prc_se),
        to_precision(mean_es_per_kcop[1], prc_mn), 
        to_precision(ster_es_per_kcop[1], prc_se),
        to_precision(gelman_rubin[0][1], prc_mn),
    )
    latex_table += ' \\\\ \n'
    # Print space delimited table of results for quick checking
    print('-' * 55)
    print('Data set: {0: <8}  Method: {1: <10}  # chains: {2}'
          .format(data_set, method, n_chains))
    print('-' * 55)
    print('    mean num. k cubic op.            {0: <6} ({1})'
          .format(to_precision(mean_n_k_cub_ops, prc_mn), 
                  to_precision(ster_n_k_cub_ops, prc_se)))
    print('    effective sample size  (sigma)   {0: <6} ({1})'
          .format(to_precision(mean_n_eff_samps[0], prc_mn), 
                  to_precision(ster_n_eff_samps[0], prc_se)))
    print('    effective sample size  (tau)     {0: <6} ({1})'
          .format(to_precision(mean_n_eff_samps[1], prc_mn), 
                  to_precision(ster_n_eff_samps[1], prc_se)))
    print('    eff. samp. / cubic op. (sigma)   {0: <6} ({1})'
          .format(to_precision(mean_es_per_kcop[0], prc_mn), 
                  to_precision(ster_es_per_kcop[0], prc_se)))
    print('    eff. samp. / cubic op. (tau)     {0: <6} ({1})'
          .format(to_precision(mean_es_per_kcop[1], prc_mn), 
                  to_precision(ster_es_per_kcop[1], prc_se)))
    print('    Gelman-Rubin statistic (sigma)   {0}'
          .format(to_precision(gelman_rubin[0][0], prc_mn)))
    print('    Gelman-Rubin statistic (tau)     {0}'
          .format(to_precision(gelman_rubin[0][1], prc_mn)))
    print('    n acc rates off-target           {0}'
          .format(np.sum((acc_rates < 0.15) | (acc_rates > 0.30))))

Print the LaTeX table rows for inclusion in the paper


In [ ]:
print(latex_table)
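
The generated rows omit the surrounding LaTeX environment; they could be wrapped as below before pasting into the paper (the ten-column specification here is an assumption and should be adjusted to match the paper's table layout)


In [ ]:
# hypothetical wrapper around the generated rows
print('\\begin{tabular}{llllllllll}\n' + latex_table + '\\end{tabular}')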

Save all chains for the different method / data set / variate combinations to a MATLAB-readable file so that the results can be loaded there and autocorrelations plotted in the same style as the other figures


In [ ]:
n_chains = 10
n_samples = 10000
# two data sets, so half of the (data set, method) pairs per data set;
# integer division so the value can be used as an array dimension
n_methods = len(file_name_pattern_map) // 2
pima_sigma_chains = np.empty((n_chains, n_samples, n_methods))
pima_tau_chains = np.empty((n_chains, n_samples, n_methods))
breast_sigma_chains = np.empty((n_chains, n_samples, n_methods))
breast_tau_chains = np.empty((n_chains, n_samples, n_methods))
pima_comp_costs = np.empty(n_methods)
breast_comp_costs = np.empty(n_methods)
pima_method_names = []
breast_method_names = []
m, n = 0, 0
for (data_set, method), (chains, chains_stats, n_effs, gelman_rubin) in results_map.items():
    if data_set.lower() == 'pima':
        pima_sigma_chains[:, :, m] = chains[:, -n_samples:, 0]
        pima_tau_chains[:, :, m] = chains[:, -n_samples:, 1]
        pima_method_names.append(method)
        pima_comp_costs[m] = chains_stats[:, -2].mean()
        m += 1
    elif data_set.lower() == 'breast':
        breast_sigma_chains[:, :, n] = chains[:, -n_samples:, 0]
        breast_tau_chains[:, :, n] = chains[:, -n_samples:, 1]
        breast_method_names.append(method)
        breast_comp_costs[n] = chains_stats[:, -2].mean()
        n += 1
pima_rel_comp_costs = pima_comp_costs / pima_comp_costs[0]
breast_rel_comp_costs = breast_comp_costs / breast_comp_costs[0]
time_stamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S_')
scipy.io.savemat(os.path.join(exp_dir, time_stamp + 'chains_matlab_dump.mat'),
                 {
                     'pima_sigma_chains' : pima_sigma_chains,
                     'pima_tau_chains' : pima_tau_chains,
                     'breast_sigma_chains' : breast_sigma_chains,
                     'breast_tau_chains' : breast_tau_chains,
                     'pima_rel_comp_costs' : pima_rel_comp_costs,
                     'breast_rel_comp_costs' : breast_rel_comp_costs,
                     'pima_method_names' : pima_method_names,
                     'breast_method_names' : breast_method_names
                 }
)
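
As an optional round-trip check, the file just written can be reloaded with scipy.io.loadmat to confirm the arrays survived intact


In [ ]:
# verify the .mat dump round-trips (uses the file name written above)
dump = scipy.io.loadmat(os.path.join(exp_dir, time_stamp + 'chains_matlab_dump.mat'))
print(dump['pima_sigma_chains'].shape)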

Plot autocorrelations for all of the chains. If many chains have been loaded this will be a large figure, so it is best viewed externally


In [ ]:
thin_factor = 10
max_lag = 30
fig = plt.figure(figsize=(40, 16))
n_dm = len(results_map)
for i, ((data_set, method), (chains, chains_stats, n_effective, gelman_rubin)) in enumerate(results_map.items()):
    for j, chain in enumerate(chains):
        ax_tau = fig.add_subplot(max_n_chains, 2 * n_dm, j * 2 * n_dm + 2 * i + 1)
        ax_sig = fig.add_subplot(max_n_chains, 2 * n_dm, j * 2 * n_dm + 2 * i + 2)
        x_tau = chain[::thin_factor, 0].copy()
        x_tau -= x_tau.mean()
        # full autocorrelation sequence, with zero lag at index x_tau.size - 1
        autocorr_tau = np.correlate(x_tau, x_tau, mode='full')[x_tau.size - 1:]
        # normalise by the zero-lag term (the variance)
        autocorr_tau /= autocorr_tau[0]
        x_sig = chain[::thin_factor, 1].copy()
        x_sig -= x_sig.mean()
        autocorr_sig = np.correlate(x_sig, x_sig, mode='full')[x_sig.size - 1:]
        autocorr_sig /= autocorr_sig[0]
        ax_tau.vlines(np.arange(max_lag) + 1, 0., autocorr_tau[1:max_lag + 1])
        ax_tau.axhline()
        ax_tau.set_yticks(np.linspace(-0.4, 0.8, 4))
        #ax_tau.set_xticks(np.arange(0, 31, 10))
        ax_sig.vlines(np.arange(max_lag) + 1, 0., autocorr_sig[1:max_lag + 1])
        ax_sig.axhline()
        #ax_sig.set_xticks(np.arange(0, 31, 10))
        ax_sig.set_yticks(np.linspace(-0.4, 0.8, 4))
        if j == 0:
            ax_tau.set_title('{0} $\\tau$'.format(data_set + ', ' + method))
            ax_sig.set_title('{0} $\\sigma$'.format(data_set + ', ' + method))
fig.tight_layout()
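
Since the grid of plots is large, it can also be written to disk for external viewing (the file name here is illustrative)


In [ ]:
fig.savefig(os.path.join(exp_dir, 'autocorr_grid.pdf'), bbox_inches='tight')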

Calculate the mean compute time across the 10 chains for the PM MH and APM SS+MH methods (for runs on the same machine) to verify that the extra quadratic operations in the APM approaches are a negligible overhead


In [ ]:
for (data_set, method), (chains, chains_stats, n_effs, gelman_rubin) in results_map.items():
    if data_set == 'Pima':
        if method in ('PM MH', 'APM SS+MH'):
            print('{0} {1} mean compute time: {2} +/- {3}'.format(
                data_set, method,
                to_precision(chains_stats[:, -1].mean(), 3),
                to_precision(chains_stats[:, -1].std(ddof=1) / chains.shape[0]**0.5, 2)))